In [1]:
import pandas as pd

In [2]:
sales = pd.DataFrame(
 {
 'weekday': ['Sun', 'Sun', 'Mon', 'Mon'],
 'city': ['Austin', 'Dallas', 'Austin', 'Dallas'],
 'bread': [139, 237, 326, 456],
 'butter': [20, 45, 70, 98]
 }
)

In [3]:
sales.groupby('weekday').count()


Out[3]:
bread butter city
weekday
Mon 2 2 2
Sun 2 2 2

In [4]:
sales.groupby('weekday')['city'].count()


Out[4]:
weekday
Mon    2
Sun    2
Name: city, dtype: int64

In [5]:
sales.groupby('weekday')[['city','butter']].count()


Out[5]:
city butter
weekday
Mon 2 2
Sun 2 2

In [6]:
sales.groupby(['weekday','city'])['butter'].mean()


Out[6]:
weekday  city  
Mon      Austin    70
         Dallas    98
Sun      Austin    20
         Dallas    45
Name: butter, dtype: int64

In [7]:
#Groupby by custom series
customers = pd.Series(['Dave','Alice','Bob','Alice'])
sales.groupby(customers)['bread'].sum()


Out[7]:
Alice    693
Bob      326
Dave     139
Name: bread, dtype: int64

In [8]:
#Groupby is more efficient (less memory and more speed) with categorical column
sales.dtypes


Out[8]:
bread       int64
butter      int64
city       object
weekday    object
dtype: object

In [9]:
sales['weekday'] = sales['weekday'].astype('category')
sales.dtypes


Out[9]:
bread         int64
butter        int64
city         object
weekday    category
dtype: object

In [10]:
sales.groupby('weekday').sum()


Out[10]:
bread butter
weekday
Mon 782 168
Sun 376 65

Group by and aggregation


In [11]:
sales.groupby('city')[['bread','butter']].agg(['max','sum'])


Out[11]:
bread butter
max sum max sum
city
Austin 326 465 70 90
Dallas 456 693 98 143

In [12]:
# custom aggregation
def data_range(series):
    return series.max() - series.min()

sales.groupby('weekday')[['bread', 'butter']].agg(data_range)


Out[12]:
bread butter
weekday
Mon 130 28
Sun 98 25

In [13]:
sales.groupby(customers)[['bread', 'butter']].agg({'bread':'sum', 'butter':data_range})


Out[13]:
bread butter
Alice 693 53
Bob 326 0
Dave 139 0

Group by and Transformation


In [14]:
def zscore(series):
    return (series - series.mean()) / series.std()

In [15]:
#!pip install pydataset
from pydataset import data

In [16]:
boston = data('Boston')
boston['id']=boston.index
boston.head()


Out[16]:
crim zn indus chas nox rm age dis rad tax ptratio black lstat medv
1 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98 24.0
2 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14 21.6
3 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03 34.7
4 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94 33.4
5 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33 36.2

In [27]:
zscore(boston['rm']).head()


Out[27]:
1    0.413263
2    0.194082
3    1.281446
4    1.015298
5    1.227362
Name: rm, dtype: float64

In [28]:
len(zscore(boston['rm']))


Out[28]:
506

In [29]:
boston.groupby('rad')['rm'].transform(zscore).head()


Out[29]:
1    0.036211
2   -0.324529
3    0.758377
4    0.813555
5    1.044462
Name: rm, dtype: float64

In [30]:
len(boston.groupby('rad')['rm'].transform(zscore))


Out[30]:
506

Apply transformation and aggregation


In [31]:
def zscore_with_indus_and_id(group):
    df = pd.DataFrame(
        {'rm': zscore(group['rm']),
         'rad': group['rad'],
         'id': group['id']})
    return df

In [33]:
boston.groupby('rad').apply(zscore_with_indus_and_id).head()


Out[33]:
id rad rm
1 1 1 0.036211
2 2 2 -0.324529
3 3 2 0.758377
4 4 3 0.813555
5 5 3 1.044462

Groupby Object


In [34]:
splitting = boston.groupby('rad')
print(type(splitting))
print(type(splitting.groups))
print(splitting.groups.keys())


<class 'pandas.core.groupby.DataFrameGroupBy'>
<class 'dict'>
dict_keys([1, 2, 3, 4, 5, 6, 7, 8, 24])

In [38]:
splitting.groups


Out[38]:
{1: Int64Index([  1, 194, 195, 255, 256, 284, 285, 286, 287, 332, 333, 342, 343,
             350, 351, 502, 503, 504, 505, 506],
            dtype='int64'),
 2: Int64Index([  2,   3,  57,  89,  90,  91,  92,  96,  97,  98,  99, 100, 121,
             122, 123, 124, 125, 126, 127, 197, 198, 199, 202, 203],
            dtype='int64'),
 3: Int64Index([  4,   5,   6,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,
              50,  55,  65,  85,  86,  87,  88, 180, 181, 182, 183, 184, 185,
             186, 187, 200, 201, 257, 270, 271, 272, 273, 274, 346, 347],
            dtype='int64'),
 4: Int64Index([ 14,  15,  16,  17,  18,  19,  20,  21,  22,  23,
             ...
             349, 352, 353, 355, 356, 489, 490, 491, 492, 493],
            dtype='int64', length=110),
 5: Int64Index([  7,   8,   9,  10,  11,  12,  13,  36,  37,  38,
             ...
             335, 336, 337, 338, 339, 340, 341, 344, 345, 354],
            dtype='int64', length=115),
 6: Int64Index([112, 113, 114, 115, 116, 117, 118, 119, 120, 239, 240, 241, 242,
             243, 244, 288, 289, 290, 494, 495, 496, 497, 498, 499, 500, 501],
            dtype='int64'),
 7: Int64Index([245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 302, 303, 304,
             305, 306, 307, 308],
            dtype='int64'),
 8: Int64Index([ 59,  60,  61,  62,  63,  64, 221, 222, 223, 224, 225, 226, 227,
             228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238],
            dtype='int64'),
 24: Int64Index([357, 358, 359, 360, 361, 362, 363, 364, 365, 366,
             ...
             479, 480, 481, 482, 483, 484, 485, 486, 487, 488],
            dtype='int64', length=132)}

In [42]:
### Group object iteration
for group_name, group in splitting:
    avg= group['age'].mean()
    print(group_name,avg)


1 45.025
2 64.77083333333333
3 49.31052631578947
4 60.84181818181818
5 69.22347826086958
6 60.14230769230768
7 40.14117647058823
8 67.35
24 89.80530303030304

In [43]:
boston.groupby('rad')['age'].mean()


Out[43]:
rad
1     45.025000
2     64.770833
3     49.310526
4     60.841818
5     69.223478
6     60.142308
7     40.141176
8     67.350000
24    89.805303
Name: age, dtype: float64

Group object: iteration and filtering


In [57]:
for group_name, group in splitting:
    avg= group.loc[group['zn']==0,'age'].mean()
    print(group_name,avg)


1 77.76666666666667
2 75.31666666666666
3 55.20384615384614
4 72.01298701298701
5 73.86410256410257
6 71.94705882352942
7 nan
8 70.5111111111111
24 89.80530303030304

In [60]:
{zn: group.loc[group['zn']==0,'age'].mean() for zn, group in splitting}


Out[60]:
{1: 77.76666666666667,
 2: 75.31666666666666,
 3: 55.20384615384614,
 4: 72.01298701298701,
 5: 73.86410256410257,
 6: 71.94705882352942,
 7: nan,
 8: 70.5111111111111,
 24: 89.80530303030304}

Boolean groupby


In [65]:
filt= boston['zn']==0
res=boston.groupby(['rad',filt])['age'].mean()
res


Out[65]:
rad  zn   
1    False    30.992857
     True     77.766667
2    False    33.133333
     True     75.316667
3    False    36.541667
     True     55.203846
4    False    34.775758
     True     72.012987
5    False    59.440541
     True     73.864103
6    False    37.844444
     True     71.947059
7    False    40.141176
8    False    57.866667
     True     70.511111
24   True     89.805303
Name: age, dtype: float64

In [69]:
res.unstack(-1)


Out[69]:
zn False True
rad
1 30.992857 77.766667
2 33.133333 75.316667
3 36.541667 55.203846
4 34.775758 72.012987
5 59.440541 73.864103
6 37.844444 71.947059
7 40.141176 NaN
8 57.866667 70.511111
24 NaN 89.805303

In [ ]: